# Read the filled database, then discard columns 2-9 and 11-13 by position.
data <- read.csv(file = "../data/filledDatabase.csv")
data <- data[, -c(2:9, 11:13)]

# Run the project helpers clean_data() and then collapse_data() on the table
# (both are defined outside this section).
data <- collapse_data(clean_data(data))

# Keep the compound, group category and space group columns as standalone
# vectors, separate from the predictors.
compound <- data[["Compound"]]
group_cat <- data[["GroupCat"]]
space_group <- data[["SpaceGroup"]]

# Drop the identifier and space-group columns from the modelling table.
# NOTE(review): presumably only GroupCat and the predictors remain after this
# step -- confirm against the columns of `data`.
data <- data %>%
  select(-c("Compound", "X", "Z", "SpaceGroup", "SpaceGroupNumber"))
# Disabled alternative: rebuild the predictors from the first 13 principal
# components.
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()

# Split the row indices into 5 held-out folds for cross-validation later.
# createFolds() samples at random and no set.seed() call is visible in this
# section, so fold membership can differ between runs.
# seq_len() is used rather than 1:nrow(data) so that an empty table yields an
# empty index vector instead of c(1, 0).
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)

Multinomial Regression

library(glmnet)
# Predictor matrix: every column of `data` except the first.
# NOTE(review): assumes GroupCat is the first column of `data` -- confirm.
X <- as.matrix(data[, -1])
# Response: the GroupCat column converted with as.matrix().
Y <- as.matrix(data$GroupCat)

Shrinkage

Ridge

# Fit the multinomial glmnet model with alpha = 0 (ridge penalty) and plot the
# labelled coefficient paths with xvar = "lambda".
model_ridge <- glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0
)
plot(
  model_ridge,
  xvar = "lambda",
  label = TRUE
)

LASSO

# Fit the multinomial glmnet model with alpha = 1 (lasso penalty) and plot the
# labelled coefficient paths with xvar = "lambda".
model_lasso <- glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1
)
plot(
  model_lasso,
  xvar = "lambda",
  label = TRUE
)

Coefficients

Ridge

# 5-fold cross-validation of the ridge (alpha = 0) multinomial model, scored
# by deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Coefficients at lambda.min from the project helper get_coef(), with the
# intercept row dropped and the three class columns kept, sent to plot_coef().
ridge_cv %>%
  get_coef(tuning_parameter = ridge_cv$lambda.min) %>%
  filter(feature != "(Intercept)") %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

LASSO

# 5-fold cross-validation of the lasso (alpha = 1) multinomial model, scored
# by deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Coefficients at lambda.min from the project helper get_coef(), with the
# intercept row dropped and the three class columns kept, sent to plot_coef().
lasso_cv %>%
  get_coef(tuning_parameter = lasso_cv$lambda.min) %>%
  filter(feature != "(Intercept)") %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

Elastic Net

library(caret)
# Tune the glmnet model through caret with 5-fold cross-validation and
# tuneLength = 10; the selected parameters end up in elastic_cv$bestTune.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  tuneLength = 10,
  trControl = trainControl(method = "cv", number = 5)
)
# Coefficients of the final model at the selected lambda, with the intercept
# row dropped and the three class columns kept, sent to plot_coef().
elastic_cv$finalModel %>%
  get_coef(tuning_parameter = elastic_cv$bestTune$lambda) %>%
  filter(feature != "(Intercept)") %>%
  select(feature, Cubic, Tilted, Others) %>%
  plot_coef()

Classification accuracy

Ridge

# Ridge (alpha = 0) results at the lambda.min chosen by cv.glmnet.
# prediction_table() is a project helper defined outside this section.
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
# Per-fold accuracy together with its mean.
print_accurate_tb(tb_ridge$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.9466667 0.92 0.890411 0.9594595 0.8918919 0.9216858
# Ridge confusion table as counts.
highlight_tb_count(tb_ridge$t)
Cubic Others Tilted
Cubic 171 3 11
Others 2 29 2
Tilted 6 5 142
Total 179 37 155
# Ridge confusion table as column proportions.
highlight_tb_percent(tb_ridge$t)
Cubic Others Tilted
Cubic 0.96 0.08 0.07
Others 0.01 0.78 0.01
Tilted 0.03 0.14 0.92
Total 100% 100% 100%
# Flatten the ridge confusion table to long form (Var1, Var2, Freq) and list
# the cells from most to least frequent.
arrange(as.data.frame(tb_ridge$t), desc(Freq))
##     Var1   Var2 Freq
## 1  Cubic  Cubic  171
## 2 Tilted Tilted  142
## 3 Others Others   29
## 4  Cubic Tilted   11
## 5 Tilted  Cubic    6
## 6 Tilted Others    5
## 7  Cubic Others    3
## 8 Others  Cubic    2
## 9 Others Tilted    2

LASSO

# Lasso (alpha = 1) results at the lambda.min chosen by cv.glmnet.
# prediction_table() is a project helper defined outside this section.
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
# Per-fold accuracy together with its mean.
print_accurate_tb(tb_lasso$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.9466667 0.9466667 0.9315068 0.9324324 0.972973 0.9460491
# Lasso confusion table as counts.
highlight_tb_count(tb_lasso$t)
Cubic Others Tilted
Cubic 177 3 7
Others 1 29 3
Tilted 1 5 145
Total 179 37 155
# Lasso confusion table as column proportions.
highlight_tb_percent(tb_lasso$t)
Cubic Others Tilted
Cubic 0.99 0.08 0.05
Others 0.01 0.78 0.02
Tilted 0.01 0.14 0.94
Total 100% 100% 100%

Elastic Net

# Elastic net results at the alpha/lambda pair selected by caret.
# The tuning values are read from bestTune by column name rather than by
# position, so the call does not depend on the column order of bestTune and
# matches the `elastic_cv$bestTune$lambda` access used for the coefficients.
# prediction_table() is a project helper defined outside this section.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# Per-fold accuracy together with its mean.
print_accurate_tb(tb_elastic$r)
Fold1 Fold2 Fold3 Fold4 Fold5 Mean
0.9466667 0.96 0.9589041 0.9324324 0.9594595 0.9514925
# Elastic net confusion table as counts.
highlight_tb_count(tb_elastic$t)
Cubic Others Tilted
Cubic 176 2 3
Others 2 29 4
Tilted 1 6 148
Total 179 37 155
# Elastic net confusion table as column proportions.
highlight_tb_percent(tb_elastic$t)
Cubic Others Tilted
Cubic 0.98 0.05 0.02
Others 0.01 0.78 0.03
Tilted 0.01 0.16 0.95
Total 100% 100% 100%